Lab_11

Author

Tyler Gallagher

1. Reading and Processing Data

cv_states <- as.data.frame(read.csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"))
state_pops <- as.data.frame(read.csv("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv"))
state_pops$abb <- state_pops$state
state_pops$state <- state_pops$state_name
state_pops$state_name <- NULL
cv_states <- merge(cv_states, state_pops, by = "state")

2. Look at the data

dim(cv_states)
[1] 58094     9
head(cv_states)
    state       date fips   cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04    1 1587224  21263      1    4887871    96.50939  AL
2 Alabama 2020-04-25    1    6213    213      1    4887871    96.50939  AL
3 Alabama 2023-02-26    1 1638348  21400      1    4887871    96.50939  AL
4 Alabama 2022-12-03    1 1549285  21129      1    4887871    96.50939  AL
5 Alabama 2020-05-06    1    8691    343      1    4887871    96.50939  AL
6 Alabama 2021-04-21    1  524367  10807      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11   56 175290   1884     56     577737    5.950611  WY
58090 Wyoming 2022-08-21   56 173487   1871     56     577737    5.950611  WY
58091 Wyoming 2021-01-26   56  51152    596     56     577737    5.950611  WY
58092 Wyoming 2021-02-21   56  53795    662     56     577737    5.950611  WY
58093 Wyoming 2021-08-22   56  70671    809     56     577737    5.950611  WY
58094 Wyoming 2021-03-20   56  55581    693     56     577737    5.950611  WY
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ date       : chr  "2023-01-04" "2020-04-25" "2023-02-26" "2022-12-03" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
 $ deaths     : int  21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : chr  "AL" "AL" "AL" "AL" ...

3. Format the data

cv_states$date <- as.Date(cv_states$date, format="%Y-%m-%d")
state_list <- unique(cv_states$state)
cv_states$state <- factor(cv_states$state, levels = state_list)
abb_list <- unique(cv_states$abb)
cv_states$abb <- factor(cv_states$abb, levels = abb_list)
cv_states <- cv_states[order(cv_states$state, cv_states$date),]
str(cv_states)
'data.frame':   58094 obs. of  9 variables:
 $ state      : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ date       : Date, format: "2020-03-13" "2020-03-14" ...
 $ fips       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ cases      : int  6 12 23 29 39 51 78 106 131 157 ...
 $ deaths     : int  0 0 0 0 0 0 0 0 0 0 ...
 $ geo_id     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ population : int  4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
 $ pop_density: num  96.5 96.5 96.5 96.5 96.5 ...
 $ abb        : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
head(cv_states)
       state       date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13    1     6      0      1    4887871    96.50939  AL
597  Alabama 2020-03-14    1    12      0      1    4887871    96.50939  AL
282  Alabama 2020-03-15    1    23      0      1    4887871    96.50939  AL
12   Alabama 2020-03-16    1    29      0      1    4887871    96.50939  AL
266  Alabama 2020-03-17    1    39      0      1    4887871    96.50939  AL
78   Alabama 2020-03-18    1    51      0      1    4887871    96.50939  AL
tail(cv_states)
        state       date fips  cases deaths geo_id population pop_density abb
57902 Wyoming 2023-03-18   56 185640   2009     56     577737    5.950611  WY
57916 Wyoming 2023-03-19   56 185640   2009     56     577737    5.950611  WY
57647 Wyoming 2023-03-20   56 185640   2009     56     577737    5.950611  WY
57867 Wyoming 2023-03-21   56 185800   2014     56     577737    5.950611  WY
58057 Wyoming 2023-03-22   56 185800   2014     56     577737    5.950611  WY
57812 Wyoming 2023-03-23   56 185800   2014     56     577737    5.950611  WY
summary(cv_states)
           state            date                 fips           cases         
 Washington   : 1158   Min.   :2020-01-21   Min.   : 1.00   Min.   :       1  
 Illinois     : 1155   1st Qu.:2020-12-06   1st Qu.:16.00   1st Qu.:  112125  
 California   : 1154   Median :2021-09-11   Median :29.00   Median :  418120  
 Arizona      : 1153   Mean   :2021-09-10   Mean   :29.78   Mean   :  947941  
 Massachusetts: 1147   3rd Qu.:2022-06-17   3rd Qu.:44.00   3rd Qu.: 1134318  
 Wisconsin    : 1143   Max.   :2023-03-23   Max.   :72.00   Max.   :12169158  
 (Other)      :51184                                                          
     deaths           geo_id        population        pop_density       
 Min.   :     0   Min.   : 1.00   Min.   :  577737   Min.   :    1.292  
 1st Qu.:  1598   1st Qu.:16.00   1st Qu.: 1805832   1st Qu.:   43.659  
 Median :  5901   Median :29.00   Median : 4468402   Median :  107.860  
 Mean   : 12553   Mean   :29.78   Mean   : 6397965   Mean   :  423.031  
 3rd Qu.: 15952   3rd Qu.:44.00   3rd Qu.: 7535591   3rd Qu.:  229.511  
 Max.   :104277   Max.   :72.00   Max.   :39557045   Max.   :11490.120  
                                                     NA's   :1106       
      abb       
 WA     : 1158  
 IL     : 1155  
 CA     : 1154  
 AZ     : 1153  
 MA     : 1147  
 WI     : 1143  
 (Other):51184  
min(cv_states$date)
[1] "2020-01-21"
max(cv_states$date)
[1] "2023-03-23"

4. Add new cases, new deaths, and correct outliers

library(plotly)
Loading required package: ggplot2

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(zoo)

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
for (i in 1:length(state_list)) {
  cv_subset <- subset(cv_states, state == state_list[i])
  cv_subset <- cv_subset[order(cv_subset$date),]

  cv_subset$new_cases = cv_subset$cases[1]
  cv_subset$new_deaths = cv_subset$deaths[1]


  for (j in 2:nrow(cv_subset)) {
    cv_subset$new_cases[j] = cv_subset$cases[j] - cv_subset$cases[j - 1]
    cv_subset$new_deaths[j] = cv_subset$deaths[j] - cv_subset$deaths[j - 1]
  }

  cv_states$new_cases[cv_states$state == state_list[i]] = cv_subset$new_cases
  cv_states$new_deaths[cv_states$state == state_list[i]] = cv_subset$new_deaths
}
cv_states <- cv_states %>% dplyr::filter(date >= "2021-06-01")
p1 <- ggplot(cv_states, aes(x = date, y = new_cases, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p1)
p1 <- NULL

p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)
p2 <- NULL
cv_states$new_cases[cv_states$new_cases < 0] = 0
cv_states$new_deaths[cv_states$new_deaths < 0] = 0
for (i in 1:length(state_list)) {
  cv_subset <- subset(cv_states, state == state_list[i])
  cv_subset$cases[1] = cv_subset$new_cases[1]
  cv_subset$deaths[1] = cv_subset$new_deaths[1]
  for (j in 2:nrow(cv_subset)) {
    cv_subset$cases[j] = cv_subset$new_cases[j] + cv_subset$cases[j - 1]
    cv_subset$deaths[j] = cv_subset$new_deaths[j] + cv_subset$deaths[j - 1]
  }
  cv_states$cases[cv_states$state == state_list[i]] = cv_subset$cases
  cv_states$deaths[cv_states$state == state_list[i]] = cv_subset$deaths
}
cv_states$new_cases = zoo::rollmean(cv_states$new_cases, k = 7, fill = NA, align = 'right') %>% round(digits = 0)
cv_states$new_deaths = zoo::rollmean(cv_states$new_deaths, k = 7, fill = NA, align = 'right') %>% round(digits = 0)
p2 <- ggplot(cv_states, aes(x = date, y = new_deaths, color = state)) +
  geom_line() + geom_point(size = .5, alpha = 0.5)
ggplotly(p2)

5. Add additional variables

# add population-normalized (by 100,000) counts for each variable
cv_states$per100k = as.numeric(format(round(cv_states$cases / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newper100k = as.numeric(format(round(cv_states$new_cases / (cv_states$population / 100000), 1), nsmall = 1))
Warning: NAs introduced by coercion
cv_states$deathsper100k = as.numeric(format(round(cv_states$deaths / (cv_states$population / 100000), 1), nsmall = 1))
cv_states$newdeathsper100k = as.numeric(format(round(cv_states$new_deaths / (cv_states$population / 100000), 1), nsmall = 1))
Warning: NAs introduced by coercion
# add a naive_CFR variable = deaths / cases
cv_states = cv_states %>% mutate(naive_CFR = round((deaths * 100 / cases), 2))

# create a `cv_states_today` variable
cv_states_today = subset(cv_states, date == max(cv_states$date))

6. Explore scatterplots using plotly

library("plotly")
cv_states_today_filter <- cv_states_today %>% filter(state != "District of Columbia")

# Continue with the plot_ly code
cv_states_today_filter %>% 
  plot_ly(x = ~pop_density, y = ~deathsper100k,
          type = 'scatter', mode = 'markers', color = ~state,
          size = ~population, sizes = c(5, 70), marker = list(sizemode='diameter', opacity=0.5),
          hoverinfo = 'text',
          text = ~paste(paste(state, ":", sep=""), paste(" Cases per 100k: ", per100k, sep=""), 
                        paste(" Deaths per 100k: ", deathsper100k, sep=""), sep = "<br>")) %>%
  layout(title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states",
         yaxis = list(title = "Deaths per 100k"), xaxis = list(title = "Population Density"),
         hovermode = "compare")
Warning: Ignoring 1 observations
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

7. Explore scatterplot trend interactively using ggplotly and geom_smooth

cv_states_today_scatter <- cv_states_today_filter %>% 
  select(pop_density, deathsper100k, population)

# Continue with the ggplotly() code
p <- ggplot(cv_states_today_scatter, aes(x=pop_density, y=deathsper100k, size=population)) + 
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Add linear regression line
  labs(title = "Population Density vs. New Deaths per 100k",
       x = "Population Density",
       y = "New Deaths per 100k") +
  theme_minimal()

# Convert ggplot to interactive plot using ggplotly()
p <- ggplotly(p)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
  the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
  variable into a factor?
p

8. Multiple Line Chart

plot_ly(cv_states, x = ~date, y = ~naive_CFR, color = ~state, type = "scatter", mode = "lines")
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors

Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
# Line chart for Florida showing new_cases and new_deaths together
cv_states %>%
  filter(state == "Florida") %>%
  plot_ly(x = ~date, y = ~new_cases, type = "scatter", mode = "lines", name = "New Cases") %>%
  add_lines(y = ~new_deaths, name = "New Deaths", opacity = 0.7, text = ~paste("Deaths: ", new_deaths)) %>%
  layout(title = "New Cases and Deaths Over Time in Florida",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Count"),
         hovermode = "compare")

9. Heatmaps

# Map state, date, and new_cases to a matrix
library(tidyr)

cv_states_mat <- cv_states %>% 
  select(state, date, new_cases) %>% 
  filter(date > as.Date("2021-06-01"))

cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = new_cases))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
heatmap_plot <- plot_ly(
  x = colnames(cv_states_mat2), 
  y = rownames(cv_states_mat2),
  z = ~cv_states_mat2,
  type = "heatmap",
  showscale = TRUE
)

heatmap_plot
# Repeat with newper100k
cv_states_mat <- cv_states %>% 
  select(state, date, newper100k) %>% 
  filter(date > as.Date("2021-06-01"))

cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
heatmap_plot_newper100k <- plot_ly(
  x = colnames(cv_states_mat2), 
  y = rownames(cv_states_mat2),
  z = ~cv_states_mat2,
  type = "heatmap",
  showscale = TRUE
)

heatmap_plot_newper100k
# Create a second heatmap after filtering to only include dates every other week
filter_dates <- seq(as.Date("2021-06-15"), as.Date("2021-11-01"), by = "2 weeks")

cv_states_mat <- cv_states %>% 
  select(state, date, newper100k) %>% 
  filter(date %in% filter_dates)

cv_states_mat2 <- as.data.frame(pivot_wider(cv_states_mat, names_from = state, values_from = newper100k))
rownames(cv_states_mat2) <- cv_states_mat2$date
cv_states_mat2$date <- NULL
cv_states_mat2 <- as.matrix(cv_states_mat2)

# Create a heatmap using plot_ly()
heatmap_plot_filtered <- plot_ly(
  x = colnames(cv_states_mat2), 
  y = rownames(cv_states_mat2),
  z = ~cv_states_mat2,)

10. Map

# For specified date
pick.date <- "2021-10-15"

# Extract the data for each state by its abbreviation
cv_per100 <- cv_states %>% 
  filter(date == pick.date) %>% 
  select(state, abb, naive_CFR, cases, deaths) %>%
  mutate(hover = paste(state, '<br>', "Naive CFR: ", naive_CFR, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))

# Set up mapping details
set_map_details <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

# Create the map for specified date
fig_pick.date <- plot_geo(cv_per100, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~naive_CFR, text = ~hover, locations = ~state,
    color = ~naive_CFR, colors = 'Reds'
  ) %>%
  colorbar(title = paste("Naive CFR as of ", pick.date))

fig_pick.date <- fig_pick.date %>% layout(
  title = paste('Naive CFR by State as of ', pick.date, '<br>(Hover for value)'),
  geo = set_map_details
)

#############
### Map for today's date

# Extract the data for each state by its abbreviation
cv_per100_today <- cv_states_today %>%  
  select(state, abb, naive_CFR, cases, deaths) %>%
  mutate(hover = paste(state, '<br>', "Naive CFR: ", naive_CFR, '<br>', "Cases: ", cases, '<br>', "Deaths: ", deaths))

# Create the map for today's date
fig_Today <- plot_geo(cv_per100_today, locationmode = 'USA-states') %>% 
  add_trace(
    z = ~naive_CFR, text = ~hover, locations = ~state,
    color = ~naive_CFR, colors = 'Reds'
  ) %>%
  colorbar(title = paste("Naive CFR as of ", Sys.Date()))

fig_Today <- fig_Today %>% layout(
  title = paste('Naive CFR by State as of', Sys.Date(), '<br>(Hover for value)'),
  geo = set_map_details
)

### Plot together 
subplot(fig_pick.date, fig_Today, nrows = 2, margin = 0.05)